Model-3 Bidirectional LSTM model using embedding layer

In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score,f1_score
import time
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
In [3]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('/content/drive/MyDrive/preprocessed.csv')

print(df.head())
                                               Tweet  Sentiment  \
0  @_angelica_toy Happy Anniversary!!!....The Day...          1   
1  @McfarlaneGlenda Happy Anniversary!!!....The D...          1   
2  @thevivafrei @JustinTrudeau Happy Anniversary!...          1   
3  @NChartierET Happy Anniversary!!!....The Day t...          1   
4  @tabithapeters05 Happy Anniversary!!!....The D...          1   

                                     stemmed_content  
0  angelica toy happi anniversari day freedumb di...  
1  mcfarlaneglenda happi anniversari day freedumb...  
2  thevivafrei justintrudeau happi anniversari da...  
3  nchartieret happi anniversari day freedumb die...  
4  tabithapet happi anniversari day freedumb die ...  
In [4]:
# Drop rows with NaN values in the 'Tweet' column
df.dropna(subset=['Tweet'], inplace=True)
In [5]:
df.Sentiment.value_counts()
Out[5]:
4    233700
2     77015
1     64004
3     42556
0     34056
Name: Sentiment, dtype: int64
In [6]:
# Define a dictionary to map numerical labels to sentiment names
sentiment_mapping = {
    4: 'Strong_Pos',
    3: 'Strong_Neg',
    2: 'Neutral',
    1: 'Mild_Pos',
    0: 'Mild_Neg'
}

df['Sentiment'] = df['Sentiment'].map(sentiment_mapping)
In [7]:
# Count the occurrences of each unique sentiment
sentiment_counts = df['Sentiment'].value_counts()

# Plotting
plt.figure(figsize=(10, 6))

# Plotting the bar chart with a different color (green)
sentiment_counts.plot(kind='bar', color='green')

# Adding title and labels
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
# Display the plot
plt.show()
In [8]:
df.Sentiment.value_counts()
Out[8]:
Strong_Pos    233700
Neutral        77015
Mild_Pos       64004
Strong_Neg     42556
Mild_Neg       34056
Name: Sentiment, dtype: int64
In [9]:
# Define input and output columns
X = df['Tweet']
y = df['Sentiment']
In [10]:
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
In [11]:
# Pad sequences
max_sequence_length = max([len(seq) for seq in X_sequences])
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)
In [12]:
sentiment_mapping = {'Strong_Pos' : 4 , 'Neutral' : 3 ,'Mild_Pos': 2 ,'Strong_Neg' : 1 ,'Mild_Neg': 0 }
y_categorical = [sentiment_mapping[sentiment] for sentiment in y]
y_categorical = to_categorical(y_categorical)
In [13]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)

Bidirectional LSTM model using embedding layer

Model Configuration and Training

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

# Define vocabulary size (number of unique words)
num_words = len(tokenizer.word_index) + 1  # Add 1 for the padding token

# Define embedding dimension
embedding_dim = 50

# Define max sequence length
max_sequence_length = 83

# Define number of classes
num_classes = len(sentiment_mapping)

# Define model architecture
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim))
model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(units=64)))
model.add(Dropout(0.5))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

start_time = time.time()
history = model.fit(X_train, y_train, epochs=2, batch_size=128, validation_split=0.2)
end_time = time.time()

train_time = end_time - start_time
print("Training Time:", train_time, "seconds")
Epoch 1/2
2257/2257 [==============================] - 3788s 2s/step - loss: 0.7096 - accuracy: 0.7412 - val_loss: 0.5059 - val_accuracy: 0.8174
Epoch 2/2
2257/2257 [==============================] - 3584s 2s/step - loss: 0.4230 - accuracy: 0.8468 - val_loss: 0.4981 - val_accuracy: 0.8225
Training Time: 7410.188728094101 seconds

Training Data Check

In [15]:
# Generate predictions for training data
y_train_pred = model.predict(X_train)
y_train_pred_classes = np.argmax(y_train_pred, axis=1)

# Compute AUC for each class
auc_positive = roc_auc_score(y_train[:, 1], y_train_pred[:, 1])
auc_negative = roc_auc_score(y_train[:, 0], y_train_pred[:, 0])

print("AUC for Positive Class:", auc_positive)
print("AUC for Negative Class:", auc_negative)

# Construct confusion matrix for training data
cm_train = confusion_matrix(np.argmax(y_train, axis=1), y_train_pred_classes)
print("Confusion Matrix for Training Data:")
print(cm_train)

# Classification Report for training data
print("Classification Report for Training Data:")
print(classification_report(np.argmax(y_train, axis=1), y_train_pred_classes))

# Compute F1 score for each class
f1_score_positive = f1_score(np.argmax(y_train, axis=1), y_train_pred_classes, average=None)[1]
f1_score_negative = f1_score(np.argmax(y_train, axis=1), y_train_pred_classes, average=None)[0]

print("F1 Score for Positive:", f1_score_positive)
print("F1 Score for Negative:", f1_score_negative)

accuracy = accuracy_score(np.argmax(y_train, axis=1), y_train_pred_classes)
print("Accuracy:", accuracy)
11284/11284 [==============================] - 1296s 115ms/step
AUC for Positive Class: 0.9900555967282751
AUC for Negative Class: 0.9720893999350884
Confusion Matrix for Training Data:
[[ 17261   5036    781   3392    769]
 [  1908  30444    254    914    552]
 [   482    472  37956   4800   7444]
 [  3120   1486   3503  50764   2784]
 [   217    384   3550   1183 181608]]
Classification Report for Training Data:
              precision    recall  f1-score   support

           0       0.75      0.63      0.69     27239
           1       0.80      0.89      0.85     34072
           2       0.82      0.74      0.78     51154
           3       0.83      0.82      0.83     61657
           4       0.94      0.97      0.96    186942

    accuracy                           0.88    361064
   macro avg       0.83      0.81      0.82    361064
weighted avg       0.88      0.88      0.88    361064

F1 Score for Positive: 0.8469135115586837
F1 Score for Negative: 0.6873195691560315
Accuracy: 0.8808216825825892

Ploting ROC cruve on traing data

In [16]:
# Compute predicted probabilities for each class
y_scores_proba_train = model.predict(X_train)

# Compute ROC curve and ROC area for each class on training set
fpr_train = dict()
tpr_train = dict()
roc_auc_train = dict()

for i in range(num_classes):
    fpr_train[i], tpr_train[i], _ = roc_curve(y_train[:, i], y_scores_proba_train[:, i])
    roc_auc_train[i] = auc(fpr_train[i], tpr_train[i])

# Plot ROC curve for training set
plt.figure(figsize=(8, 6))
for i in range(num_classes):
    plt.plot(fpr_train[i], tpr_train[i], label=f'Class {i} (AUC = {roc_auc_train[i]:0.2f})')

plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Model (Training Set)')
plt.legend(loc="lower right")
plt.show()
11284/11284 [==============================] - 1310s 116ms/step

Testing Data Check

In [17]:
# Generate predictions for testing data
y_test_pred = model.predict(X_test)
y_test_pred_classes = np.argmax(y_test_pred, axis=1)
# Compute AUC for each class
auc_positive = roc_auc_score(y_test[:, 1], y_test_pred[:, 1])
auc_negative = roc_auc_score(y_test[:, 0], y_test_pred[:, 0])
print("AUC for Positive Class:", auc_positive)
print("AUC for Negative Class:", auc_negative)
# Construct confusion matrix for testing data
cm_test = confusion_matrix(np.argmax(y_test, axis=1), y_test_pred_classes)
print("Confusion Matrix for Testing Data:")
print(cm_test)

# Classification Report for testing data
print("Classification Report for Testing Data:")
print(classification_report(np.argmax(y_test, axis=1), y_test_pred_classes))
# Compute F1 score for each class
f1_score_positive = f1_score(np.argmax(y_test, axis=1), y_test_pred_classes, average=None)[1]
f1_score_negative = f1_score(np.argmax(y_test, axis=1), y_test_pred_classes, average=None)[0]

print("F1 Score for Positive:", f1_score_positive)
print("F1 Score for Negative:", f1_score_negative)
accuracy = accuracy_score(np.argmax(y_test, axis=1), y_test_pred_classes)
print("Accuracy:", accuracy)
2821/2821 [==============================] - 327s 116ms/step
AUC for Positive Class: 0.9801403752468922
AUC for Negative Class: 0.9496297936299771
Confusion Matrix for Testing Data:
[[ 3933  1195   306  1100   283]
 [ 1045  6761   119   345   214]
 [  217   160  8362  1603  2508]
 [  917   405  1255 11701  1080]
 [   96   133  2312   783 43434]]
Classification Report for Testing Data:
              precision    recall  f1-score   support

           0       0.63      0.58      0.60      6817
           1       0.78      0.80      0.79      8484
           2       0.68      0.65      0.66     12850
           3       0.75      0.76      0.76     15358
           4       0.91      0.93      0.92     46758

    accuracy                           0.82     90267
   macro avg       0.75      0.74      0.75     90267
weighted avg       0.82      0.82      0.82     90267

F1 Score for Positive: 0.789006885284164
F1 Score for Negative: 0.603915547024952
Accuracy: 0.8219061229463702

Ploting ROC cruve on testing data

In [18]:
# Compute predicted probabilities for each class on testing data
y_scores_proba_test = model.predict(X_test)

# Compute ROC curve and ROC area for each class on testing set
fpr_test = dict()
tpr_test = dict()
roc_auc_test = dict()

for i in range(num_classes):  # Assuming num_classes is defined
    fpr_test[i], tpr_test[i], _ = roc_curve(y_test[:, i], y_scores_proba_test[:, i])
    roc_auc_test[i] = auc(fpr_test[i], tpr_test[i])

# Plot ROC curve for testing set
plt.figure(figsize=(8, 6))
for i in range(num_classes):  # Assuming num_classes is defined
    plt.plot(fpr_test[i], tpr_test[i], label=f'Class {i} (AUC = {roc_auc_test[i]:0.2f})')

plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Model (Testing Set)')
plt.legend(loc="lower right")
plt.show()
2821/2821 [==============================] - 322s 114ms/step

top 2 features with the highest weights

In [19]:
# Get the weights of the embedding layer
embedding_weights = model.layers[0].get_weights()[0]

# Get the word index
word_index = tokenizer.word_index

# Calculate the norms of the embedding vectors
embedding_norms = np.linalg.norm(embedding_weights, axis=1)

# Get the indices of the top 2 features with the highest norms
top_features_indices = np.argsort(embedding_norms)[-2:]

# Print the top 2 features with their corresponding weights
print("Top 2 Features with the Highest Weights:")
for i, index in enumerate(top_features_indices):
    word = list(word_index.keys())[list(word_index.values()).index(index)]
    weight = embedding_norms[index]
    print(f"Feature: {word}, Weight: {weight:.4f}")
Top 2 Features with the Highest Weights:
Feature: perfectly, Weight: 1.7785
Feature: but, Weight: 2.0063

hyperparameter tuning on 5000 data sample¶

In [20]:
# Define the number of data points to keep for each sentiment class
num_data_per_sentiment = 5000

# Filter the data to keep only 5000 data points for each sentiment class
X_filtered = []
y_filtered = []
for sentiment, data in zip(y, X_padded):
    sentiment_label = sentiment_mapping[sentiment]
    if y_filtered.count(sentiment_label) < num_data_per_sentiment:
        X_filtered.append(data)
        y_filtered.append(sentiment_label)

# Convert the filtered data to numpy arrays
X_filtered = np.array(X_filtered)
y_filtered = np.array(y_filtered)

# Convert the labels to categorical format
y_categorical_filtered = to_categorical(y_filtered)

# Train-test split on the filtered dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_categorical_filtered, test_size=0.2, random_state=42)

# Print the shapes of the filtered datasets
print("Shapes of Filtered Data:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

# Define hyperparameter combinations for each iteration
hyperparameters = [
    {'units': 64, 'dropout_rate': 0.3},
    {'units': 128, 'dropout_rate': 0.5},
    {'units': 256, 'dropout_rate': 0.7}
]

# List to store validation accuracies for each iteration
validation_accuracies = []

# Ensure that all indices in X_train and X_test are within the range [0, num_words - 1]
X_train_clipped = np.clip(X_train, 0, num_words - 1)
X_test_clipped = np.clip(X_test, 0, num_words - 1)

# Perform hyperparameter tuning for each iteration
for i, params in enumerate(hyperparameters, start=1):
    print(f"Iteration {i}: Hyperparameters - {params}")

    # Define model architecture with current hyperparameters
    model = Sequential()
    model.add(Embedding(input_dim=num_words, output_dim=embedding_dim))
    model.add(Bidirectional(LSTM(units=params['units'], return_sequences=True)))
    model.add(Dropout(params['dropout_rate']))
    model.add(Bidirectional(LSTM(units=int(params['units'] / 2))))
    model.add(Dropout(params['dropout_rate']))
    model.add(Dense(units=num_classes, activation='softmax'))

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train_clipped, y_train, epochs=1, batch_size=128, validation_split=0.2, verbose=0)

    # Get validation accuracy from training history
    validation_accuracy = history.history['val_accuracy'][0]
    validation_accuracies.append(validation_accuracy)

    print(f"Validation Accuracy for Iteration {i}: {validation_accuracy}")

# Print the validation accuracies for each iteration
for i, acc in enumerate(validation_accuracies, start=1):
    print(f"Iteration {i} Validation Accuracy: {acc}")
Shapes of Filtered Data:
X_train: (20000, 83)
X_test: (5000, 83)
y_train: (20000, 5)
y_test: (5000, 5)
Iteration 1: Hyperparameters - {'units': 64, 'dropout_rate': 0.3}
Validation Accuracy for Iteration 1: 0.46799999475479126
Iteration 2: Hyperparameters - {'units': 128, 'dropout_rate': 0.5}
Validation Accuracy for Iteration 2: 0.484250009059906
Iteration 3: Hyperparameters - {'units': 256, 'dropout_rate': 0.7}
Validation Accuracy for Iteration 3: 0.49549999833106995
Iteration 1 Validation Accuracy: 0.46799999475479126
Iteration 2 Validation Accuracy: 0.484250009059906
Iteration 3 Validation Accuracy: 0.49549999833106995

cross-validation for 3 fold on sample data 5000

In [21]:
# Define KFold cross-validation
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Initialize list to store accuracy scores
cv_scores = []

# Iterate over the cross-validation splits
for train_index, val_index in kf.split(X_train_clipped):
    # Define model architecture
    model = Sequential()
    model.add(Embedding(input_dim=num_words, output_dim=embedding_dim))
    model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(units=64)))
    model.add(Dropout(0.5))
    model.add(Dense(units=num_classes, activation='softmax'))

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Fit the model on the training data for this fold
    model.fit(X_train_clipped, y_train, epochs=1, batch_size=128, verbose=0)

    # Generate predictions for the validation fold
    val_predictions = model.predict(X_train_clipped)

    # Convert predictions to class labels
    val_pred_classes = np.argmax(val_predictions, axis=1)

    # Calculate accuracy for the validation fold
    val_accuracy = accuracy_score(np.argmax(y_train, axis=1), val_pred_classes)

    # Append accuracy to list
    cv_scores.append(val_accuracy)

# Convert list to numpy array
cv_scores = np.array(cv_scores)

# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)

# Calculate and print mean cross-validation accuracy
mean_cv_accuracy = np.mean(cv_scores)
print("Mean CV Accuracy:", mean_cv_accuracy)
625/625 [==============================] - 73s 114ms/step
625/625 [==============================] - 79s 120ms/step
625/625 [==============================] - 79s 124ms/step
Cross-validation Scores: [0.6047  0.59495 0.59995]
Mean CV Accuracy: 0.5998666666666667
In [24]:
pip install lime
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 275.7/275.7 kB 4.9 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from lime) (3.7.1)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from lime) (1.25.2)
Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from lime) (1.11.4)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from lime) (4.66.2)
Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.10/dist-packages (from lime) (1.2.2)
Requirement already satisfied: scikit-image>=0.12 in /usr/local/lib/python3.10/dist-packages (from lime) (0.19.3)
Requirement already satisfied: networkx>=2.2 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (3.2.1)
Requirement already satisfied: pillow!=7.1.0,!=7.1.1,!=8.3.0,>=6.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (9.4.0)
Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (2.31.6)
Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (2024.2.12)
Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (1.5.0)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (24.0)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.18->lime) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.18->lime) (3.3.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (1.2.0)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (4.49.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (1.4.5)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (2.8.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->lime) (1.16.0)
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... done
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283835 sha256=d2d22c4dd7d3b5dfc9787b2e31bc5ec08502bfe8a560cbaa74d83cb807834c06
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1

Model Interpretability using LIME

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from lime.lime_text import LimeTextExplainer

# Take a subset of 5000 data samples
X_subset = X[:5000]
y_subset = y[:5000]

# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_subset)
X_sequences = tokenizer.texts_to_sequences(X_subset)

# Pad sequences
max_sequence_length = max([len(seq) for seq in X_sequences])
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)

# Define sentiment mapping
# Define a dictionary mapping numerical sentiment to its corresponding label
sentiment_mapping = {'Strong_Pos': 4, 'Neutral': 3, 'Mild_Pos': 2, 'Strong_Neg': 1, 'Mild_Neg': 0}
y_categorical = [sentiment_mapping[sentiment] for sentiment in y_subset]
y_categorical = to_categorical(y_categorical)

# Define vocabulary size (number of unique words)
num_words = len(tokenizer.word_index) + 1  # Add 1 for the padding token
# Define number of classes
num_classes = len(sentiment_mapping)


# Create LimeTextExplainer
explainer = LimeTextExplainer(class_names=list(sentiment_mapping.keys()))

# Choose a sample for explanation (you can change the sample_index)
sample_index = 0
sample_text = X_subset[sample_index]
true_sentiment = y_categorical[sample_index]

# Define a function to predict probabilities using the model
def predict_proba(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)
    return model.predict(padded_sequences)
# Predict sentiment probabilities
predicted_probabilities = predict_proba([sample_text])[0]
# Define a dictionary mapping numerical sentiment to its corresponding label
sentiment_labels = {0: 'Mild_Neg', 1: 'Strong_Neg', 2: 'Mild_Pos', 3: 'Neutral', 4: 'Strong_Pos'}
# Get the label of the predicted sentiment
predicted_sentiment_label = sentiment_labels[np.argmax(predicted_probabilities)]



# Generate local explanation
explanation = explainer.explain_instance(sample_text, predict_proba, num_features=10)

# Print true sentiment, predicted sentiment, and local explanation
#print("True Sentiment:", true_sentiment_label)
print("Predicted Sentiment:", predicted_sentiment_label)
explanation.show_in_notebook(text=True)
1/1 [==============================] - 0s 144ms/step
157/157 [==============================] - 21s 132ms/step
Predicted Sentiment: Mild_Pos
In [ ]:

In [31]:
# Iterate over each sentiment category
for label in sentiment_mapping:
    print("Sentiment:", label)

    # Get the indices of samples with the current sentiment label
    indices = np.where(y_subset == label)[0]

    if len(indices) > 0:
        # Print explanations for the first 5 samples
        count = 0
        for sample_index in indices:
            sample_text = X_subset[sample_index]
            true_sentiment = y_categorical[sample_index]

            # Generate explanation for the sample
            explanation = explainer.explain_instance(sample_text, predict_proba, num_features=10)

            # Print the sample text
            print("Sample Text:", sample_text)

            # Print the explanation
            explanation.show_in_notebook(text=True)

            count += 1
            if count == 2:
                break
    else:
        print("No samples found for this sentiment category.")
Sentiment: Strong_Pos
157/157 [==============================] - 14s 87ms/step
Sample Text: Freedom Convoy as InkBlot Test https://t.co/auLrduDpdI
157/157 [==============================] - 16s 101ms/step
Sample Text: @mark_slapinski Well it’s pretty easy to see what their agenda is and Pierre has remained silent on the issues and he never actually fought for the convoy just did a photo op
Sentiment: Neutral
157/157 [==============================] - 20s 126ms/step
Sample Text: @JustinTrudeau You Belong In Jail.
#VaccineMandates #CrimesAgainstHumanity #TrudeauDictatorship
#FreedomConvoy
https://t.co/HrsYk2IYXC
157/157 [==============================] - 20s 129ms/step
Sample Text: #FreeDumbConvoy #FreedomConvoy #Freedumbers #freedumb #freedom
Sentiment: Mild_Pos
157/157 [==============================] - 23s 144ms/step
Sample Text: @_angelica_toy Happy Anniversary!!!....The Day the FreeDUMB Died (In the tune of Don McLean's "American Pie") #FreeDumbConvoy #Freedumbers #FluTruxKlan #convoywatch #convoy #FreedomConvoy   https://t.co/ZT1cIPwmh9
157/157 [==============================] - 24s 156ms/step
Sample Text: @McfarlaneGlenda Happy Anniversary!!!....The Day the FreeDUMB Died (In the tune of Don McLean's "American Pie") #FreeDumbConvoy #Freedumbers #FluTruxKlan #convoywatch #convoy #FreedomConvoy   https://t.co/ZT1cIPwmh9
Sentiment: Strong_Neg
157/157 [==============================] - 25s 160ms/step
Sample Text: @brethordark The #FreedomConvoy 1 year Anniversary... they don't like FREEDOM or HONKING!!!
157/157 [==============================] - 25s 158ms/step
Sample Text: #Freedumbers partied as they caused fellow Canadians to be prisoners in their own homes

When the pandemic came, their inconvenience was much more important, any lives lost be damned

But #TrudeauIsAPsychopath? Ć°ÅøĀ¤ā€

Nah

#FreeDumbConvoy #cdnpoli #FreedomConvoy #TrudeauWasRight
Sentiment: Mild_Neg
157/157 [==============================] - 25s 158ms/step
Sample Text: @FightHaven Those knee drops remind me of something...

Oh right. 
Trudeau's crackdown on the #FreedomConvoy Trucker's protest.
157/157 [==============================] - 24s 155ms/step
Sample Text: @AndreLemelin4 We are a sovereign nation with a democratically elected government - planting another country’s flag in front of our federal parliament (particularly in relation to this “freedom convoy�) strongly implies treasonous intent!
In [ ]:

In [ ]:

In [ ]:

In [30]:
!jupyter nbconvert --to html Copy of NLP_3_Q-3.ipynb
[NbConvertApp] WARNING | pattern 'Copy' matched no files
[NbConvertApp] WARNING | pattern 'of' matched no files
[NbConvertApp] WARNING | pattern 'NLP_3_Q-3.ipynb' matched no files
This application is used to convert notebook files (*.ipynb)
        to various other formats.

        WARNING: THE COMMANDLINE INTERFACE MAY CHANGE IN FUTURE RELEASES.

Options
=======
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePreprocessor.enabled=True]
--allow-errors
    Continue notebook execution even if one of the cells throws an error and include the error message in the cell output (the default behaviour is to abort conversion). This flag is only relevant if '--execute' was specified, too.
    Equivalent to: [--ExecutePreprocessor.allow_errors=True]
--stdin
    read a single notebook file from stdin. Write the resulting notebook with default basename 'notebook.*'
    Equivalent to: [--NbConvertApp.from_stdin=True]
--stdout
    Write notebook output to stdout instead of files.
    Equivalent to: [--NbConvertApp.writer_class=StdoutWriter]
--inplace
    Run nbconvert in place, overwriting the existing notebook (only
            relevant when converting to notebook format)
    Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory=]
--clear-output
    Clear output of current file and save in place,
            overwriting the existing notebook.
    Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --ClearOutputPreprocessor.enabled=True]
--no-prompt
    Exclude input and output prompts from converted document.
    Equivalent to: [--TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True]
--no-input
    Exclude input cells and output prompts from converted document.
            This mode is ideal for generating code-free reports.
    Equivalent to: [--TemplateExporter.exclude_output_prompt=True --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True]
--allow-chromium-download
    Whether to allow downloading chromium if no suitable version is found on the system.
    Equivalent to: [--WebPDFExporter.allow_chromium_download=True]
--disable-chromium-sandbox
    Disable chromium security sandbox when converting to PDF..
    Equivalent to: [--WebPDFExporter.disable_sandbox=True]
--show-input
    Shows code input. This flag is only useful for dejavu users.
    Equivalent to: [--TemplateExporter.exclude_input=False]
--embed-images
    Embed the images as base64 dataurls in the output. This flag is only useful for the HTML/WebPDF/Slides exports.
    Equivalent to: [--HTMLExporter.embed_images=True]
--sanitize-html
    Whether the HTML in Markdown cells and cell outputs should be sanitized..
    Equivalent to: [--HTMLExporter.sanitize_html=True]
--log-level=<Enum>
    Set the log level by value or name.
    Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
    Default: 30
    Equivalent to: [--Application.log_level]
--config=<Unicode>
    Full path of a config file.
    Default: ''
    Equivalent to: [--JupyterApp.config_file]
--to=<Unicode>
    The export format to be used, either one of the built-in formats
            ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf']
            or a dotted object name that represents the import path for an
            ``Exporter`` class
    Default: ''
    Equivalent to: [--NbConvertApp.export_format]
--template=<Unicode>
    Name of the template to use
    Default: ''
    Equivalent to: [--TemplateExporter.template_name]
--template-file=<Unicode>
    Name of the template file to use
    Default: None
    Equivalent to: [--TemplateExporter.template_file]
--theme=<Unicode>
    Template specific theme(e.g. the name of a JupyterLab CSS theme distributed
    as prebuilt extension for the lab template)
    Default: 'light'
    Equivalent to: [--HTMLExporter.theme]
--sanitize_html=<Bool>
    Whether the HTML in Markdown cells and cell outputs should be sanitized.This
    should be set to True by nbviewer or similar tools.
    Default: False
    Equivalent to: [--HTMLExporter.sanitize_html]
--writer=<DottedObjectName>
    Writer class used to write the
                                        results of the conversion
    Default: 'FilesWriter'
    Equivalent to: [--NbConvertApp.writer_class]
--post=<DottedOrNone>
    PostProcessor class used to write the
                                        results of the conversion
    Default: ''
    Equivalent to: [--NbConvertApp.postprocessor_class]
--output=<Unicode>
    overwrite base name use for output files.
                can only be used when converting one notebook at a time.
    Default: ''
    Equivalent to: [--NbConvertApp.output_base]
--output-dir=<Unicode>
    Directory to write output(s) to. Defaults
                                  to output to the directory of each notebook. To recover
                                  previous default behaviour (outputting to the current
                                  working directory) use . as the flag value.
    Default: ''
    Equivalent to: [--FilesWriter.build_directory]
--reveal-prefix=<Unicode>
    The URL prefix for reveal.js (version 3.x).
            This defaults to the reveal CDN, but can be any url pointing to a copy
            of reveal.js.
            For speaker notes to work, this must be a relative path to a local
            copy of reveal.js: e.g., "reveal.js".
            If a relative path is given, it must be a subdirectory of the
            current directory (from which the server is run).
            See the usage documentation
            (https://nbconvert.readthedocs.io/en/latest/usage.html#reveal-js-html-slideshow)
            for more details.
    Default: ''
    Equivalent to: [--SlidesExporter.reveal_url_prefix]
--nbformat=<Enum>
    The nbformat version to write.
            Use this to downgrade notebooks.
    Choices: any of [1, 2, 3, 4]
    Default: 4
    Equivalent to: [--NotebookExporter.nbformat_version]

Examples
--------

    The simplest way to use nbconvert is

            > jupyter nbconvert mynotebook.ipynb --to html

            Options include ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf'].

            > jupyter nbconvert --to latex mynotebook.ipynb

            Both HTML and LaTeX support multiple output templates. LaTeX includes
            'base', 'article' and 'report'.  HTML includes 'basic', 'lab' and
            'classic'. You can specify the flavor of the format used.

            > jupyter nbconvert --to html --template lab mynotebook.ipynb

            You can also pipe the output to stdout, rather than a file

            > jupyter nbconvert mynotebook.ipynb --stdout

            PDF is generated via latex

            > jupyter nbconvert mynotebook.ipynb --to pdf

            You can get (and serve) a Reveal.js-powered slideshow

            > jupyter nbconvert myslides.ipynb --to slides --post serve

            Multiple notebooks can be given at the command line in a couple of
            different ways:

            > jupyter nbconvert notebook*.ipynb
            > jupyter nbconvert notebook1.ipynb notebook2.ipynb

            or you can specify the notebooks list in a config file, containing::

                c.NbConvertApp.notebooks = ["my_notebook.ipynb"]

            > jupyter nbconvert --config mycfg.py

To see all available configurables, use `--help-all`.